Package org.terrier.structures.indexing

Source Code of org.terrier.structures.indexing.PostingVisitor

/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org/
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is BlockFieldDocumentPostingList.java
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
*   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original contributor)
*/
package org.terrier.structures.indexing;

import gnu.trove.THashMap;
import gnu.trove.TIntHashSet;
import gnu.trove.TIntObjectHashMap;
import gnu.trove.TObjectIntProcedure;

import java.util.Arrays;

import org.terrier.sorting.HeapSortInt;
import org.terrier.structures.postings.BlockFieldPostingImpl;
import org.terrier.structures.postings.BlockPosting;
import org.terrier.structures.postings.IterablePosting;
import org.terrier.structures.postings.WritablePosting;
import org.terrier.utility.TermCodes;
/** BlockFieldDocumentPostingList class */
public class BlockFieldDocumentPostingList extends FieldDocumentPostingList {
  /** mapping term to blockids in this document */
  protected final THashMap<String, TIntHashSet> term_blocks = new THashMap<String, TIntHashSet>(AVG_DOCUMENT_UNIQUE_TERMS);
  /** number of blocks in this document. usually equal to document length, but perhaps less */
  protected int blockCount = 0;
   
  /**
   * Constructs an instance of BlockFieldDocumentPostingList.
   * @param NUM_FIELDS
   */
  public BlockFieldDocumentPostingList(int NUM_FIELDS) {
    super(NUM_FIELDS);
  }
  /**
   * Get the blocks for the specified term
   */
  public int[] getBlocks(String term)
  {
    int[] rtr = term_blocks.get(term).toArray();
    if (rtr ==  null)
      return new int[0];
    Arrays.sort(rtr);
    return rtr;
  }

  /** Insert a term into this document, occurs at given block id, and in the given field */
  public void insert(String t, int fieldId, int blockId)
  {
    super.insert(t, fieldId);
    TIntHashSet blockids = null;
    if ((blockids = term_blocks.get(t)) == null)
    {
      term_blocks.put(t, blockids = new TIntHashSet(/*TODO */));
    }
    blockids.add(blockId);
    blockCount++;
  }

  /** Insert a term into this document, occurs at given block id, and in the given fields */
  public void insert(String t, int[] fieldIds, int blockId)
  {
    super.insert(t, fieldIds);
    TIntHashSet blockids = null;
    if ((blockids = term_blocks.get(t)) == null)
    {
      term_blocks.put(t, blockids = new TIntHashSet(/*TODO */));
    }
    blockids.add(blockId);
    blockCount++;
  }

  /** Insert a term into this document tf times, occurs at given block id, and in the given fields */
  public void insert(int tf, String t, int[] fieldIds, int blockId)
  {
    super.insert(tf, t, fieldIds);
    TIntHashSet blockids = null;
    if ((blockids = term_blocks.get(t)) == null)
    {
      term_blocks.put(t, blockids = new TIntHashSet(/*TODO */));
    }
    blockids.add(blockId);
    blockCount++;
  }
 
  class blockFieldPostings extends fieldPostingIterator implements BlockPosting
  {
    public blockFieldPostings(String[] _terms, int[] ids) {
      super(_terms, ids);
    }   
   
    /** {@inheritDoc} */
    public int[] getPositions() {
      int[] blockIds = term_blocks.get(terms[i]).toArray();
      Arrays.sort(blockIds);
      return blockIds;
    }

    @Override
    public WritablePosting asWritablePosting() {
      BlockFieldPostingImpl fbp = new BlockFieldPostingImpl(termIds[i], getFrequency(), getPositions(), fieldCount);
      System.arraycopy(getFieldFrequencies(), 0, fbp.getFieldFrequencies(), 0, fieldCount);
      return fbp;
    }   
  }
 
  /** returns the postings suitable to be written into the block direct index */
  public int[][] getPostings()
  {
    final int termCount = occurrences.size();
    final int[] termids = new int[termCount];
    final int[] tfs = new int[termCount];
   
    final int[][] fields = new int[fieldCount][termCount];
    final int[] blockfreqs = new int[termCount];
    final TIntObjectHashMap<int[]> term2blockids = new TIntObjectHashMap<int[]>();
    int blockTotal = 0; //TODO we already have blockTotal as this.blockCount, so no need to count?
   
    class PostingVisitor implements TObjectIntProcedure<String> {
      int blockTotal = 0;
      int i=0;
      public boolean execute(final String a, final int b)
      {
        termids[i] = TermCodes.getCode(a);
        tfs[i] = b;
        for(int fi=0;fi<fieldCount;fi++)
          fields[fi][i] = field_occurrences[fi].get(a);
        final TIntHashSet ids = term_blocks.get(a);
        blockfreqs[i] = ids.size();
        this.blockTotal += ids.size();
        final int[] bids = ids.toArray();
        Arrays.sort(bids);
        term2blockids.put(termids[i], bids);
        i++;
        return true;
      }
    }
    PostingVisitor proc = new PostingVisitor();
    occurrences.forEachEntry(proc);
    blockTotal = proc.blockTotal;
    int[][] tmppostings = new int[3+fieldCount][];
    tmppostings[0] = termids;
    tmppostings[1] = tfs;
    for(int fi=0;fi<fieldCount;fi++)
      tmppostings[fi+2] = fields[fi];
    tmppostings[fieldCount+2] = blockfreqs;
    HeapSortInt.ascendingHeapSort(tmppostings);
    final int[] blockids = new int[blockTotal];
    int offset = 0;
    for (int termid : termids)
    {
      final int[] src = term2blockids.get(termid);
      final int src_l = src.length;
      System.arraycopy(src, 0, blockids, offset, src_l);
      offset+= src_l;
    }
    int[][] postings = new int[4+fieldCount][];
    postings[0] = tmppostings[0];
    postings[1] = tmppostings[1];
    for(int fi=0;fi<fieldCount;fi++)
      postings[fi+2] = fields[fi];
    postings[fieldCount+2] = blockfreqs;
    postings[fieldCount+3] = blockids;
    return postings;
  }

  @Override
  protected IterablePosting makePostingIterator(String[] _terms, int[] termIds)
  {
    return new blockFieldPostings(_terms, termIds);
  }
}
TOP

Related Classes of org.terrier.structures.indexing.PostingVisitor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.